library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(broom)
library(magrittr)
library(plotly)
library(RSQLite)
library(reshape2)
library(visNetwork)
library(networkD3)
library(jsonlite)
library(RColorBrewer)
library(gplots)
library(knitr)
library(DT)
library(data.table)
library(d3heatmap)
library(viridis)
library(maps)
library(ggmap)
library(circlize)

rm(list = ls())
Saison <-2012

# Functions ---------------------------------------------------------------


rsplit <- function(x) {
  x <- x[!is.na(x[,1]),,drop=FALSE]
  if(nrow(x)==0) return(NULL)
  if(ncol(x)==1) return(lapply(x[,1], function(v) list(name=v)))
  s <- split(x[,-1, drop=FALSE], x[,1])
  unname(mapply(function(v,n) {if(!is.null(v)) list(name=n, children=v) else list(name=n)}, lapply(s, rsplit), names(s), SIMPLIFY=FALSE))
}


# Connect to data base ----------------------------------------------------
con <- dbConnect(SQLite(), dbname="input/database.sqlite")
# con <- dbConnect(SQLite(), dbname="database160721.sqlite")
# list all tables
# dbListTables(con)

player       <- tbl_df(dbGetQuery(con,"SELECT * FROM player"))
# player_stats <- tbl_df(dbGetQuery(con,"SELECT * FROM player_stats"))
Match        <- tbl_df(dbGetQuery(con,"SELECT * FROM Match"))
Team        <- tbl_df(dbGetQuery(con,"SELECT * FROM Team"))
Country        <- tbl_df(dbGetQuery(con,"SELECT * FROM Country"))
League        <- tbl_df(dbGetQuery(con,"SELECT * FROM League"))

Find connections between different tables

library(visNetwork)
  # this function creates a data.frame with the name of the data.frame and the names of the columns of that data.frame
  create_df_of_names = function(df, name){
    data.frame(from = name, to = names(df))
  }
      list_of_df <- list(player = player,Match = Match, Team = Team, Country = Country, League = League) 

  create_vis_network <- function(list_of_df) {
    # create a names list of the data.frames
    # and map them to build one data.frame with two columns
    # - from contains all  data.frame names
    # - to  contains all column names
    edge <- map2_df(list_of_df,names(list_of_df), create_df_of_names)
    
    # create a visNetwork
    
    nodesFrom <-  edge %>% cbind(unlist(.$from),"Table") %>% select(3,4) %>% data.frame  
    nodesTo <-  edge %>% cbind(unlist(.$to),"Attribute") %>% select(3,4) %>% data.frame 
    
    names(nodesFrom) <- c("id", "group")
    names(nodesTo) <- c("id", "group")
    
    nodes <- rbind(nodesFrom,nodesTo) %>% unique() 
    nodes$id <- as.character((nodes$id))  
    nodes <- nodes %>% unique() %>% arrange(id)
    visNetwork(nodes, edge)%>%
      visOptions(highlightNearest = list(enabled = TRUE, degree = 2), nodesIdSelection = TRUE) %>%
      visEdges(arrows = "to") %>%  
      visGroups(groupname = "Table",     shape = "icon", icon = list(code = "f114", color = "green",size = 75)) %>%
      visGroups(groupname = "Attribute", shape = "icon", icon = list(code = "f115", color = "lightgreen", size = 45)) %>%
      addFontAwesome() 
    # list of icons http://astronautweb.co/snippet/font-awesome/
  }
  
  create_vis_network(list_of_df)

Select relevant columns

player  <- select(player,player_api_id, player_name) # use player_api_id as key for join
Team    <- select(Team, team_api_id, team_long_name, team_short_name) # use team_api_id as key for join
Country <-select(Country, id, name) %>% rename(country_id = id)  %>% rename(country_name = name)   # use country_id as key for join
League  <- select(League, country_id, name) %>% rename(league_name = name) # use country_id as key for join
Match   <-select(Match, id, country_id, league_id, season, stage, date, match_api_id, home_team_api_id, away_team_api_id, home_team_goal, away_team_goal, home_player_1, home_player_2, home_player_3, home_player_4, home_player_5, home_player_6, home_player_7, home_player_8, home_player_9, home_player_10, home_player_11, away_player_1, away_player_2, away_player_3, away_player_4, away_player_5, away_player_6, away_player_7, away_player_8, away_player_9, away_player_10, away_player_11, goal, shoton, shotoff, foulcommit, card, cross, corner, possession)

Join the tables so that all information is available from matchMelt

# melt match data to generate df with player names in one column ----------
matchMelt <-melt(Match,id = c(1:11), measure=c(12:33),na.rm = TRUE, value.name = "player_api_id") %>% 
  mutate(team_api_id=ifelse(grepl("home",variable),home_team_api_id,
                            ifelse(grepl("away",variable),away_team_api_id,NA))) %>%  # get longer names by joining appropriate data.frames
  left_join(Team, by = "team_api_id") %>%
  left_join(player, by = "player_api_id") %>% # add club to each player
  left_join(Country, by = "country_id") %>% # add club to each player
  left_join(League, by = "country_id") %>% # add club to each player
  separate(season, into=c("saisonStart","saisonEnd"),sep = "/", convert = TRUE)  # split saison so it integer
      list_of_df <- list(player = player,matchMelt = matchMelt, Team = Team, Country = Country, League = League) 
  create_vis_network(list_of_df)
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

Built league table in format data.table because the composite key was easier to create with data.table keycols = c(“season”, “league_id”, “home_team_api_id” )

PointsDf <-Match %>% 
  select(1:11)  %>% 
  mutate(homePoint = if_else((home_team_goal > away_team_goal),3,if_else((home_team_goal == away_team_goal),1,0))) %>%
  mutate(awayPoint = if_else((home_team_goal > away_team_goal),0,if_else((home_team_goal == away_team_goal),1,3))) 

# Create home and way tables which will be combined to calculate total points

tableHomeDt <- PointsDf %>% 
  group_by(season, league_id, home_team_api_id) %>%
  summarise(pointsHome = sum(homePoint)) %>%
  ungroup() %>% data.table
  
  keycols = c("season", "league_id", "home_team_api_id" )
setkeyv(tableHomeDt,keycols) 

tableAwayDt <- PointsDf %>% 
  group_by(season, league_id, away_team_api_id) %>%
  summarise(pointsAway = sum(awayPoint)) %>%
  ungroup()  %>% data.table 
  keycols = c("season", "league_id", "away_team_api_id" )
setkeyv(tableAwayDt,keycols) 

# combine the two tables and create ranking
tableHomeAwayDt <- tableHomeDt[tableAwayDt, nomatch=0] %>%
  mutate(points = pointsHome + pointsAway) %>%
  group_by(season, league_id)  %>%
  mutate(rank = min_rank(desc(points)))

tableLong <- tableHomeAwayDt %>% 
  left_join(League, by = c("league_id" = "country_id")) %>%
  left_join(Team, by = c("home_team_api_id" = "team_api_id")) %>%
  ungroup() %>%
  select(season, league_name, rank, team_long_name, points)

Create Transfer data.frame

TransferDf <-matchMelt %>%
  select(player_name, team_long_name, team_short_name, saisonStart, saisonEnd, country_name, league_name)  %>%
  group_by(player_name,team_long_name) %>% # now we have all players with their team
  arrange(saisonStart)  %>% 
  summarise(Player = first(player_name), ClubFirst = min(saisonStart),ClubLast = max(saisonEnd), Country = first(country_name), League = first(league_name)) %>%
# now players first and last game for a club is found
  arrange(ClubFirst) %>%
  mutate(FormerClub = lag(team_long_name)) %>%
  mutate(CurrentClub = team_long_name) %>%
  mutate(FormerLeague = lag(League)) %>%
  mutate(CurrentLeague = League) %>%
  mutate(FormerCountry = lag(Country)) %>%
  mutate(CurrentCountry = Country) %>%
  select(Player, CurrentClub,   FormerClub, ClubFirst,  ClubLast, CurrentLeague, FormerLeague, CurrentCountry, FormerCountry)
## Adding missing grouping variables: `player_name`

Define function for VisNetwork used later

visNetworkPerClub  <- function(matchMelt, Club, Saison)
{
PlayerSelected <- matchMelt  %>%
  filter(saisonStart == Saison) %>% 
  filter(team_long_name == Club) %>%
  select(player_name) %>%
  unique()


edges <- matchMelt %>%
  filter(saisonStart>= Saison) %>%
  filter(player_name %in% PlayerSelected$player_name)  %>%
  select(c(team_long_name,player_name)) %>%
  rename(from = team_long_name)  %>%
  rename(to = player_name) %>%
  unique()  %>% 
  mutate(arrows = c("from"))

edgesMelt <- edges %>%
  mutate(shape = "") %>%
  melt(id = "shape", measure = c("to", "from"), value.name = "id")

nodesClub <- edgesMelt %>%
  filter(variable == "from") %>%
  mutate(group = c("Club"))

nodesPlayer <- edgesMelt %>%
  filter(variable == "to") %>%
  mutate(group = c("Player")) 

nodes <- rbind(nodesClub,nodesPlayer) %>% select(c(variable,id, group)) %>% unique()  

visNetwork(nodes, edges, main = list(text = paste0("Where did the player of ", Club, " play after 2012" ),
 style = "font-family:Comic Sans MS;color:#ff0000;font-size:15px;text-align:center;")) %>%
  visGroups( groupname = "Player", color = "lightgreen") %>%
  visGroups( groupname = "Club", color = "lightblue") %>%
  visOptions(highlightNearest = list(enabled = TRUE, degree =1), nodesIdSelection = FALSE) %>%
  visInteraction(dragNodes = FALSE, dragView = FALSE, zoomView = FALSE)  %>%
  visGroups(groupname = "Club", shape = "icon", icon = list(code = "f1e3", size = 75)) %>%
  visGroups(groupname = "Player", shape = "icon", icon = list(code = "f183", color = "green")) %>%
  addFontAwesome() %>%
  visInteraction(navigationButtons = TRUE) 
}

# Set variable for VisNetwork, when to look at where players go afterwards

The fans stay loyal and the players move on!

Don’t know how you feel, but when I see the players kiss their shirt and then move on to the next club I feel a little bit deluded.
In the following graphs the move of players through the leagues and between the clubs are shown.

  • Lets see how the leagues compare
  • How are the big teams compare?
  • How did their points evolve over time?
  • See who is cashing in by changing the clubs many times

The analysis is based on a kaggle dataset https://www.kaggle.com/hugomathien/soccer

First let us look at the transfers since 2008 in a chord digram. Please note that an D3 based interactive version of chord diagrams is available in the package “chorddiag”, however, this package is not available on Kaggle. Check code to see how to integrate the interactive version.

The chord diagram links the two leagues with an arc, the end of the arc scales with the number of players transferd from that country to the country on the other end of the arc. E.g. see the arc from Portugal to Spain. On the Portugal side the arc is wider than on the Spain side because more players move from Portugal to Spain than vica versa. And since the total number of transfers of the two countries is similar the width of the arc can be compared directly.


Chord diagram of transfers between leagues

TransferMatrix <- na.omit(TransferDf)  %>% ungroup() %>% group_by(FormerLeague, CurrentLeague) %>%
 summarise(sub = n()) %>% ungroup() %>%  na.omit()  %>%
   mutate_each(funs(factor), FormerLeague:CurrentLeague) %>% acast(FormerLeague ~ CurrentLeague, value.var = "sub") # now matrix with leagues on both dimensions

kaggle <- 0


if (kaggle == 0) {
   library(chorddiag)
chorddiag(TransferMatrix)
} else {
   chordDiagram(TransferMatrix)
circos.clear()
}

Lets see how the numbers look like. Italy has almost twice as much transfers as Germany, with the new TV money floating around in England I guess soon there will be an inrease in transfers to England.

Number of transfers between leagues in table format

na.omit(TransferDf)  %>% ungroup() %>% group_by(FormerLeague) %>%
 summarise(NumberOfTransfers = n()) %>% arrange(desc(NumberOfTransfers))  %>% 
  datatable( rownames = FALSE, colnames =c("League", "Number of transfers since 2008") ,options = list(dom = 't', autoWidth = TRUE, columnDefs = list(list(width = '250px', targets = c(1)))))

Italy has the highest volume on transfers, the bulk of it within the league. Surprisingly the Scottish league has the lowest number of transfers.




Number of transfers within leagues in table format

# note "filter_" expects as second argument a logical predicates becuase this time we don't test content of column but if two column have equal content.
na.omit(TransferDf)  %>% ungroup() %>% filter_("FormerLeague==CurrentLeague") %>% group_by(FormerLeague) %>% summarise(NumberOfTransfers = n()) %>% arrange(desc(NumberOfTransfers))  %>% 
  
  datatable( rownames = FALSE, colnames =c("League", "Number of transfers within league since 2008") ,options = list(dom = 't', autoWidth = TRUE,
  columnDefs = list(list(width = '250px', targets = c(1)))))



Other than Spain and Portugal swapping place the same pattern is seen in the in-league transfers.




Where did players of the season 2012 play since then?

Zoom and select nodes to get more insight, navigate with the tabs to the club your are most interested in.
The arrowhead indicates that the player played for that team

  1. Top tabs let you choose the league
  2. Second row tabs let you choose teams within the selected league
  • Check out Rafael from ManU, he played for 5 teams.
  • Guess how many players went from playing for Real to SSC Napoli?

English league


ManU

Club <- "Manchester United"
visNetworkPerClub(matchMelt, Club, Saison)
Arsenal

Club <- "Arsenal"
visNetworkPerClub(matchMelt, Club, Saison)

German league


Bayern

Club <- "Bayern Munich"
visNetworkPerClub(matchMelt, Club, Saison)
Dortmund

Club <- "Borussia Dortmund"
visNetworkPerClub(matchMelt, Club, Saison)
VFB Stuttgart

Club <- "VfB Stuttgart"
visNetworkPerClub(matchMelt, Club, Saison)

Spanish league


Real

Club <- "Real Madrid"
visNetworkPerClub(matchMelt, Club, Saison)
Barca

Club <- "Barcelona"
visNetworkPerClub(matchMelt, Club, Saison)

That was intersting, but how much are certain clubs tied together over the years?




How did it work out for the teams, all tables of all leagues since 2008/2009

Just type in search field “england 2008/2009” to get premiere league table of season 2008/2009, or “bundes 2011/2012” for German Bundesliga of season 2011/2012 and sort for “Rank”. BTW, the table considers only points, not goal difference, gives at times a better feeling about how close the whole thing often is, especially at the bottom of the table.


Interactive table to show league tables of selected leagues and season

datatable(tableLong, rownames = FALSE, colnames =c("Season", "League", "Rank", "Team", "Points"),options = list(
  order = list(list(2, 'asc')), pageLength = 25, search = list(search = 'england 2015/2016')))



Heatmaps of Clubs in Leagues, how many points did they have at the end of the season

Points express better than ranking the strength of a team. After all, ranking is relative, one season the team can be champion with 80 points, next season its only worth 3rd place.

tableLong$points <- as.factor(tableLong$points)
p <- ggplot(filter(tableLong, league_name %in% c("Germany 1. Bundesliga", "England Premier League" )), mapping = aes(x = season, y = team_long_name)) + 
  geom_tile(mapping = aes(fill = points),color="white", size=0.1 ) + facet_grid(league_name~., scales = "free_y") +scale_fill_viridis(discrete=TRUE) + theme(legend.position = "none")  # free y scale to avoid that all clubs are on Y axis in all leagues
ggplotly(p)
TransferRadialCLubs <- function(TransferDf, Club, Saison)
{
TransferRadialLeagues  <- TransferDf %>%
    filter(grepl(Club , FormerClub )) %>%
  filter(ClubFirst >= Saison) %>%
    as.list() %>% as.data.frame(stringsAsFactors = FALSE) %>%
   select(FormerClub, CurrentCountry, CurrentClub, Player) %>%
  arrange(FormerClub, CurrentCountry, CurrentClub, Player) 

TransferRadialList <- rsplit(TransferRadialLeagues)[[1]]
radialNetwork(TransferRadialList, fontSize = 20, height = 700, width = 1000, linkColour = "green", nodeColour = "green", nodeStroke = "lightgreen", textColour = "blue" )  
}



Where did players play since 2012, this time in a radial network just for fun

ManU


Club <- "Manchester United"
TransferRadialCLubs(TransferDf, Club, Saison)

Arsenal

Club <- "Arsenal"
TransferRadialCLubs(TransferDf, Club, Saison)

Bayern

Club <- "Bayern Munich"
TransferRadialCLubs(TransferDf, Club, Saison)

Dortmund

Club <- "Borussia Dortmund"
TransferRadialCLubs(TransferDf, Club, Saison)

VFB Stuttgart

Club <- "VfB Stuttgart"
TransferRadialCLubs(TransferDf, Club, Saison)

Real

Club <- "Real Madrid"
TransferRadialCLubs(TransferDf, Club, Saison)

Barca




Check Transfer dataframe with some players

Always good to check if the data is generated correctly.
Note, the transfers were determined by the date the player played for the club, therefore there is a slight discrepancy.

Lewandowski
2006–2008 Znicz Pruszków
2008–2010 Lech Poznań
2010–2014 Borussia Dortmund
2014– Bayern Munich

Rene Adler
2006–2012 Bayer Leverkusen
2012– Hamburger SV

# check with Lewandowski


 TransferDf %>% filter(grepl("Robert Lewandowski" ,Player )) %>% select(-Player)  %>% kable()
player_name CurrentClub FormerClub ClubFirst ClubLast CurrentLeague FormerLeague CurrentCountry FormerCountry
Robert Lewandowski Lech Poznan NA 2008 2010 Poland Ekstraklasa NA Poland NA
Robert Lewandowski Borussia Dortmund Lech Poznan 2010 2014 Germany 1. Bundesliga Poland Ekstraklasa Germany Poland
Robert Lewandowski Bayern Munich Borussia Dortmund 2014 2016 Germany 1. Bundesliga Germany 1. Bundesliga Germany Germany
 TransferDf %>% filter(grepl("Adler" ,Player ))  %>% select(-Player)  %>% kable()
player_name CurrentClub FormerClub ClubFirst ClubLast CurrentLeague FormerLeague CurrentCountry FormerCountry
Rene Adler Bayer Leverkusen NA 2008 2011 Germany 1. Bundesliga NA Germany NA
Rene Adler Hamburger SV Bayer Leverkusen 2012 2016 Germany 1. Bundesliga Germany 1. Bundesliga Germany Germany

to does

  1. create gif animation of radials to show in markdown doc